#!/bin/bash
# Copyright 2012-2013  Johns Hopkins University (Author: Daniel Povey);
#                      Arnab Ghoshal
#                2014  Guoguo Chen
#                2015  Hainan Xu

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This script prepares a directory such as data/lang/, in the standard format,
# given a source directory containing a dictionary lexicon.txt in a form like:
# word phone1 phone2 ... phoneN
# per line (alternate prons would be separate lines), or a dictionary with probabilities
# called lexiconp.txt in a form:
# word pron-prob phone1 phone2 ... phoneN
# (with 0.0 < pron-prob <= 1.0); note: if lexiconp.txt exists, we use it even if
# lexicon.txt exists.
# and also files silence_phones.txt, nonsilence_phones.txt, optional_silence.txt
# and extra_questions.txt
# Here, silence_phones.txt and nonsilence_phones.txt are lists of silence and
# non-silence phones respectively (where silence includes various kinds of 
# noise, laugh, cough, filled pauses etc., and nonsilence phones includes the 
# "real" phones.)
# In each line of those files is a list of phones, and the phones on each line 
# are assumed to correspond to the same "base phone", i.e. they will be 
# different stress or tone variations of the same basic phone.
# The file "optional_silence.txt" contains just a single phone (typically SIL) 
# which is used for optional silence in the lexicon.
# extra_questions.txt might be empty; typically will consist of lists of phones,
# all members of each list with the same stress or tone; and also possibly a 
# list for the silence phones.  This will augment the automtically generated 
# questions (note: the automatically generated ones will treat all the 
# stress/tone versions of a phone the same, so will not "get to ask" about 
# stress or tone).

# This script adds word-position-dependent phones and constructs a host of other
# derived files, that go in data/lang/.

# Begin configuration section.
num_sil_states=5
num_nonsil_states=3
position_dependent_phones=true
# position_dependent_phones is false also when position dependent phones and word_boundary.txt 
# have been generated by another source
reverse=false
share_silence_phones=false  # if true, then share pdfs of different silence 
                            # phones together.
sil_prob=0.5
phone_symbol_table=              # if set, use a specified phones.txt file.
# end configuration sections

. utils/parse_options.sh 

if [ $# -ne 4 ]; then 
  echo "usage: utils/prepare_lang.sh <dict-src-dir> <oov-dict-entry> <tmp-dir> <lang-dir>"
  echo "e.g.: utils/prepare_lang.sh data/local/dict <SPOKEN_NOISE> data/local/lang data/lang"
  echo "<dict-src-dir> should contain the following files:"
  echo " extra_questions.txt  lexicon.txt nonsilence_phones.txt  optional_silence.txt  silence_phones.txt"
  echo "See http://kaldi.sourceforge.net/data_prep.html#data_prep_lang_creating for more info."
  echo "options: "
  echo "     --num-sil-states <number of states>             # default: 5, #states in silence models."
  echo "     --num-nonsil-states <number of states>          # default: 3, #states in non-silence models."
  echo "     --position-dependent-phones (true|false)        # default: true; if true, use _B, _E, _S & _I"
  echo "                                                     # markers on phones to indicate word-internal positions. "
  echo "     --reverse (true|false)                          # reverse lexicon."
  echo "     --share-silence-phones (true|false)             # default: false; if true, share pdfs of "
  echo "                                                     # all non-silence phones. "
  echo "     --sil-prob <probability of silence>             # default: 0.5 [must have 0 <= silprob < 1]"
  echo "     --phone-symbol-table <filename>                 # default: \"\"; if not empty, use the provided "
  echo "                                                     # phones.txt as phone symbol table. This is useful "
  echo "                                                     # if you use a new dictionary for the existing setup."
  exit 1;
fi

srcdir=$1
oov_word=$2
tmpdir=$3
dir=$4
mkdir -p $dir $tmpdir $dir/phones

silprob=false
[ -f $srcdir/lexiconp_silprob.txt ] && silprob=true

[ -f path.sh ] && . ./path.sh

! utils/validate_dict_dir.pl $srcdir && \
  echo "*Error validating directory $srcdir*" && exit 1;

if [[ ! -f $srcdir/lexicon.txt ]]; then
  echo "**Creating $dir/lexicon.txt from $dir/lexiconp.txt"
  perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' < $srcdir/lexiconp.txt > $srcdir/lexicon.txt || exit 1;
fi
if [[ ! -f $srcdir/lexiconp.txt ]]; then
  echo "**Creating $srcdir/lexiconp.txt from $srcdir/lexicon.txt"
  perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $srcdir/lexiconp.txt || exit 1;
fi

if ! utils/validate_dict_dir.pl $srcdir >&/dev/null; then
  utils/validate_dict_dir.pl $srcdir  # show the output.
  echo "Validation failed (second time)"
  exit 1;
fi

# phones.txt file provided, we will do some sanity check here.
if [[ ! -z $phone_symbol_table ]]; then
  # Checks if we have position dependent phones
  n1=`cat $phone_symbol_table | grep -v -E "^#[0-9]+$" | cut -d' ' -f1 | sort -u | wc -l`
  n2=`cat $phone_symbol_table | grep -v -E "^#[0-9]+$" | cut -d' ' -f1 | sed 's/_[BIES]$//g' | sort -u | wc -l`
  $position_dependent_phones && [ $n1 -eq $n2 ] &&\
    echo "$0: Position dependent phones requested, but not in provided phone symbols" && exit 1;
  ! $position_dependent_phones && [ $n1 -ne $n2 ] &&\
      echo "$0: Position dependent phones not requested, but appear in the provided phones.txt" && exit 1;

  # Checks if the phone sets match.
  cat $srcdir/{,non}silence_phones.txt | awk -v f=$phone_symbol_table '
  BEGIN { while ((getline < f) > 0) { sub(/_[BEIS]$/, "", $1); phones[$1] = 1; }}
  { for (x = 1; x <= NF; ++x) { if (!($x in phones)) {
      print "Phone appears in the lexicon but not in the provided phones.txt: "$x; exit 1; }}}' || exit 1;
fi

if $position_dependent_phones; then
  # Create $tmpdir/lexicon.original from $srcdir/lexicon.txt by
  # adding the markers _B, _E, _S, _I depending on word position.
  # In this recipe, these markers apply to silence also.
  # Do this starting from lexiconp.txt only.
  if "$silprob"; then 
    perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; $silword_p = shift @A;
              $wordsil_f = shift @A; $wordnonsil_f = shift @A; @A>0||die;
         if(@A==1) { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_S\n"; } 
         else { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_B ";
         for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \
                < $srcdir/lexiconp_silprob.txt > $tmpdir/lexiconp_silprob.txt
    if $reverse; then
      echo "We do not support reverse option and silprob at the same time"
      exit 1
    fi
  else
    perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; @A>0||die;
         if(@A==1) { print "$w $p $A[0]_S\n"; } else { print "$w $p $A[0]_B ";
         for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \
           < $srcdir/lexiconp.txt > $tmpdir/lexiconp.pre_reverse || exit 1;
    if $reverse; then
      echo "reversing lexicon."
      cat $tmpdir/lexiconp.pre_reverse \
        | awk '{printf "%s %s ",$1, $2;for(i=NF;i>2;i--){printf "%s ",$i;}printf "\n"}' \
        > $tmpdir/lexiconp.txt
    else
      mv $tmpdir/lexiconp.pre_reverse $tmpdir/lexiconp.txt
    fi
  fi
  
  # create $tmpdir/phone_map.txt
  # this has the format (on each line)
  # <original phone> <version 1 of original phone> <version 2> ...
  # where the versions depend on the position of the phone within a word. 
  # For instance, we'd have:
  # AA AA_B AA_E AA_I AA_S
  # for (B)egin, (E)nd, (I)nternal and (S)ingleton
  # and in the case of silence
  # SIL SIL SIL_B SIL_E SIL_I SIL_S
  # [because SIL on its own is one of the variants; this is for when it doesn't
  #  occur inside a word but as an option in the lexicon.]

  # This phone map expands the phone lists into all the word-position-dependent
  # versions of the phone lists.

  cat <(for x in `cat $srcdir/silence_phones.txt`; do for y in "" "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \
    <(for x in `cat $srcdir/nonsilence_phones.txt`; do for y in "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \
    > $tmpdir/phone_map.txt
else
  if "$silprob"; then 
    cp $srcdir/lexiconp_silprob.txt $tmpdir/lexiconp_silprob.txt
    if $reverse; then
      echo "We do not support reverse option and silprob at the same time"
      exit 1
    fi
  else
    cp $srcdir/lexiconp.txt $tmpdir/lexiconp.pre_reverse
  fi

  if $reverse; then
    echo "reversing lexicon."
    cat $tmpdir/lexiconp.pre_reverse \
      | awk '{printf "%s %s ",$1, $2;for(i=NF;i>2;i--){printf "%s ",$i;}printf "\n"}' \
      > $tmpdir/lexiconp.txt
  else
    mv $tmpdir/lexiconp.pre_reverse $tmpdir/lexiconp.txt
  fi

  cat $srcdir/silence_phones.txt $srcdir/nonsilence_phones.txt | \
    sed 's/ /\n/g' | awk '(NF>0){print}' > $tmpdir/phones
  paste -d' ' $tmpdir/phones $tmpdir/phones > $tmpdir/phone_map.txt
fi

mkdir -p $dir/phones  # various sets of phones...

# Sets of phones for use in clustering, and making monophone systems.

if $share_silence_phones; then
  # build a roots file that will force all the silence phones to share the
  # same pdf's. [three distinct states, only the transitions will differ.]
  # 'shared'/'not-shared' means, do we share the 3 states of the HMM
  # in the same tree-root?
  # Sharing across models(phones) is achieved by writing several phones
  # into one line of roots.txt (shared/not-shared doesn't affect this).
  # 'not-shared not-split' means we have separate tree roots for the 3 states,
  # but we never split the tree so they remain stumps,
  # so all phones in the line correspond to the same model.

  cat $srcdir/silence_phones.txt | awk '{printf("%s ", $0); } END{printf("\n");}' | cat - $srcdir/nonsilence_phones.txt | \
    utils/apply_map.pl $tmpdir/phone_map.txt > $dir/phones/sets.txt
  cat $dir/phones/sets.txt | \
    awk '{if(NR==1) print "not-shared", "not-split", $0; else print "shared", "split", $0;}' > $dir/phones/roots.txt
else
  # different silence phones will have different GMMs.  [note: here, all "shared split" means
  # is that we may have one GMM for all the states, or we can split on states.  because they're
  # context-independent phones, they don't see the context.]
  cat $srcdir/{,non}silence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt > $dir/phones/sets.txt
  cat $dir/phones/sets.txt | awk '{print "shared", "split", $0;}' > $dir/phones/roots.txt
fi

cat $srcdir/silence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt | \
  awk '{for(n=1;n<=NF;n++) print $n;}' > $dir/phones/silence.txt
cat $srcdir/nonsilence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt | \
  awk '{for(n=1;n<=NF;n++) print $n;}' > $dir/phones/nonsilence.txt
cp $srcdir/optional_silence.txt $dir/phones/optional_silence.txt
cp $dir/phones/silence.txt $dir/phones/context_indep.txt

# if extra_questions.txt is empty, it's OK.
cat $srcdir/extra_questions.txt 2>/dev/null | utils/apply_map.pl $tmpdir/phone_map.txt \
  >$dir/phones/extra_questions.txt

# Want extra questions about the word-start/word-end stuff. Make it separate for
# silence and non-silence. Probably doesn't matter, as silence will rarely
# be inside a word.
if $position_dependent_phones; then
  for suffix in _B _E _I _S; do
    (for x in `cat $srcdir/nonsilence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt
  done
  for suffix in "" _B _E _I _S; do
    (for x in `cat $srcdir/silence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt
  done
fi

# add disambig symbols to the lexicon in $tmpdir/lexiconp.txt
# and produce $tmpdir/lexicon_*disambig.txt

if "$silprob"; then
  ndisambig=`utils/add_lex_disambig.pl --pron-probs --sil-probs $tmpdir/lexiconp_silprob.txt $tmpdir/lexiconp_silprob_disambig.txt`
else
  ndisambig=`utils/add_lex_disambig.pl --pron-probs $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt`
fi
ndisambig=$[$ndisambig+1]; # add one disambig symbol for silence in lexicon FST.
echo $ndisambig > $tmpdir/lex_ndisambig

# Format of lexiconp_disambig.txt:
# !SIL	1.0   SIL_S
# <SPOKEN_NOISE>	1.0   SPN_S #1
# <UNK>	1.0  SPN_S #2
# <NOISE>	1.0  NSN_S
# !EXCLAMATION-POINT	1.0  EH2_B K_I S_I K_I L_I AH0_I M_I EY1_I SH_I AH0_I N_I P_I OY2_I N_I T_E

( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) >$dir/phones/disambig.txt

# Create phone symbol table.
if [[ ! -z $phone_symbol_table ]]; then
  start_symbol=`grep \#0 $phone_symbol_table | awk '{print $2}'`
  echo "<eps>" | cat - $dir/phones/{silence,nonsilence}.txt | awk -v f=$phone_symbol_table '
  BEGIN { while ((getline < f) > 0) { phones[$1] = $2; }} { print $1" "phones[$1]; }' | sort -k2 -g |\
    cat - <(cat $dir/phones/disambig.txt | awk -v x=$start_symbol '{n=x+NR-1; print $1, n;}') > $dir/phones.txt 
else
  echo "<eps>" | cat - $dir/phones/{silence,nonsilence,disambig}.txt | \
    awk '{n=NR-1; print $1, n;}' > $dir/phones.txt
fi

# Create a file that describes the word-boundary information for
# each phone.  5 categories.
if $position_dependent_phones; then
  cat $dir/phones/{silence,nonsilence}.txt | \
    awk '/_I$/{print $1, "internal"; next;} /_B$/{print $1, "begin"; next; }
         /_S$/{print $1, "singleton"; next;} /_E$/{print $1, "end"; next; }
         {print $1, "nonword";} ' > $dir/phones/word_boundary.txt
else
  # word_boundary.txt might have been generated by another source
  [ -f $srcdir/word_boundary.txt ] && cp $srcdir/word_boundary.txt $dir/phones/word_boundary.txt
fi

# Create word symbol table.
# <s> and </s> are only needed due to the need to rescore lattices with
# ConstArpaLm format language model. They do not normally appear in G.fst or
# L.fst.

if "$silprob"; then
  # remove the silprob
  cat $tmpdir/lexiconp_silprob.txt |\
    awk '{
      for(i=1; i<=NF; i++) {
        if(i!=3 && i!=4 && i!=5) printf("%s\t", $i); if(i==NF) print "";
      }
    }' > $tmpdir/lexiconp.txt
fi

cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq  | awk '
  BEGIN {
    print "<eps> 0";
  } 
  {
    if ($1 == "<s>") {
      print "<s> is in the vocabulary!" | "cat 1>&2"
      exit 1;
    }
    if ($1 == "</s>") {
      print "</s> is in the vocabulary!" | "cat 1>&2"
      exit 1;
    }
    printf("%s %d\n", $1, NR);
  }
  END {
    printf("#0 %d\n", NR+1);
    printf("<s> %d\n", NR+2);
    printf("</s> %d\n", NR+3);
  }' > $dir/words.txt || exit 1;

# format of $dir/words.txt:
#<eps> 0
#!EXCLAMATION-POINT 1
#!SIL 2
#"CLOSE-QUOTE 3
#...

silphone=`cat $srcdir/optional_silence.txt` || exit 1;
[ -z "$silphone" ] && \
  ( echo "You have no optional-silence phone; it is required in the current scripts"
    echo "but you may use the option --sil-prob 0.0 to stop it being used." ) && \
   exit 1;

# create $dir/phones/align_lexicon.{txt,int}.
# This is the new-new style of lexicon aligning.

# First remove pron-probs from the lexicon.
perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' <$tmpdir/lexiconp.txt >$tmpdir/align_lexicon.txt

# Note: here, $silphone will have no suffix e.g. _S because it occurs as optional-silence,
# and is not part of a word.
[ ! -z "$silphone" ] && echo "<eps> $silphone" >> $tmpdir/align_lexicon.txt

cat $tmpdir/align_lexicon.txt | \
 perl -ane '@A = split; print $A[0], " ", join(" ", @A), "\n";' | sort | uniq > $dir/phones/align_lexicon.txt

# create phones/align_lexicon.int
cat $dir/phones/align_lexicon.txt | utils/sym2int.pl -f 3- $dir/phones.txt | \
  utils/sym2int.pl -f 1-2 $dir/words.txt > $dir/phones/align_lexicon.int

# Create the basic L.fst without disambiguation symbols, for use
# in training. 

if $silprob; then
  # Usually it's the same as having a fixed-prob L.fst
  # it matters a little bit in discriminative trainings
  utils/make_lexicon_fst_silprob.pl $tmpdir/lexiconp_silprob_disambig.txt $srcdir/silprob.txt $silphone '#'$ndisambig | \
     sed 's=\#[0-9][0-9]*=<eps>=g' | \
     fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
     --keep_isymbols=false --keep_osymbols=false |   \
     fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
else
  utils/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp.txt $sil_prob $silphone | \
    fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
    --keep_isymbols=false --keep_osymbols=false | \
     fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
fi

# The file oov.txt contains a word that we will map any OOVs to during
# training.
echo "$oov_word" > $dir/oov.txt || exit 1;
cat $dir/oov.txt | utils/sym2int.pl $dir/words.txt >$dir/oov.int || exit 1;
# integer version of oov symbol, used in some scripts.


# Create these lists of phones in colon-separated integer list form too, 
# for purposes of being given to programs as command-line options.
for f in silence nonsilence optional_silence disambig context_indep; do
  utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt >$dir/phones/$f.int
  utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt | \
   awk '{printf(":%d", $1);} END{printf "\n"}' | sed s/:// > $dir/phones/$f.csl || exit 1;
done

for x in sets extra_questions; do
  utils/sym2int.pl $dir/phones.txt <$dir/phones/$x.txt > $dir/phones/$x.int || exit 1;
done

utils/sym2int.pl -f 3- $dir/phones.txt <$dir/phones/roots.txt \
   > $dir/phones/roots.int || exit 1;

#if $position_dependent_phones; then
if [ -f $dir/phones/word_boundary.txt ]; then
  utils/sym2int.pl -f 1 $dir/phones.txt <$dir/phones/word_boundary.txt \
    > $dir/phones/word_boundary.int || exit 1;
fi

silphonelist=`cat $dir/phones/silence.csl`
nonsilphonelist=`cat $dir/phones/nonsilence.csl`
utils/gen_topo.pl $num_nonsil_states $num_sil_states $nonsilphonelist $silphonelist >$dir/topo


# Create the lexicon FST with disambiguation symbols, and put it in lang_test.
# There is an extra step where we create a loop to "pass through" the
# disambiguation symbols from G.fst.
phone_disambig_symbol=`grep \#0 $dir/phones.txt | awk '{print $2}'`
word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'`

if $silprob; then
  utils/make_lexicon_fst_silprob.pl $tmpdir/lexiconp_silprob_disambig.txt $srcdir/silprob.txt $silphone '#'$ndisambig | \
     fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
     --keep_isymbols=false --keep_osymbols=false |   \
     fstaddselfloops  "echo $phone_disambig_symbol |" "echo $word_disambig_symbol |" | \
     fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1;
else
  utils/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt $sil_prob $silphone '#'$ndisambig | \
     fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
     --keep_isymbols=false --keep_osymbols=false |   \
     fstaddselfloops  "echo $phone_disambig_symbol |" "echo $word_disambig_symbol |" | \
     fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1;
fi


echo "$(basename $0): validating output directory"
! utils/validate_lang.pl $dir && echo "$(basename $0): error validating output" &&  exit 1;

exit 0;

